# URL_dataCoding.R
# John G. Bullock and Aaron L. Gertler
# 2016 August 02

# This file imports and codes data on URLs that have been published in the 
# APSR.  It is called by most of the R files that we used to create figures
# for our article,
#
#   Gertler, Aaron L., and John G. Bullock.  2016.  "Reference Rot: An 
#   Emerging Threat to Transparency in Political Science."  PS: Political 
#   Science and Politics.

library(car)      # for Recode()

# CREATE QW() FUNCTION FOR EASY STRING HANDLING
# qw() splits a string on runs of whitespace and returns the pieces as a
# character vector.  For example, qw("doing the rhododendrons") is equivalent
# to the clunkier c("doing", "the", "rhododendrons").
#
# Args:
#   x: a character vector.  Each element is split on whitespace, and the
#      pieces from all elements are concatenated in order.
# Returns:
#   A character vector of the whitespace-delimited words in x.
#
# Leading whitespace is stripped before splitting.  Without that step,
# strsplit() returns an empty string as the first piece whenever the input
# begins with whitespace (e.g., an indented or multi-line string), and that
# stray "" would end up in arguments like col.names or colClasses.
qw <- function(x) {
  x <- sub("^[[:space:]]+", "", x)
  unlist(strsplit(x, "[[:space:]]+"))
}



##############################################################################
# IMPORT AND CLEAN THE DATA
##############################################################################
# Read the URL-level data.  header = TRUE consumes the first row of the file;
# the variable names themselves come from col.names.  The two qw() strings are
# aligned so that each column name sits directly above its class.
URL_data <- read.csv(
  "URL_data.csv",
  header     = TRUE,
  col.names  = qw("year    issue   page    URL       URL_type US_fedGov referent result2014 result2016"),
  colClasses = qw("integer integer integer character factor   integer   factor   factor     factor"))

# US_fedGov is read as an integer.  Store it as a logical that is TRUE where
# the value is 1.
URL_data[["US_fedGov"]] <- URL_data[["US_fedGov"]] == 1

# Read the number of articles in each issue, then total those counts within
# each year.  articlesPerYear has one element per distinct value of "year".
articlesPerIssue <- read.csv(
  "APSR_articlesPerIssue.csv",
  header    = TRUE,
  col.names = qw("year issue numArticles"))
articlesPerYear <- with(
  articlesPerIssue,
  tapply(X = numArticles, INDEX = year, FUN = sum))

# Rename two categories of the "referent" variable and set the order of its
# levels in a single step.  Any value that is not one of the three levels 
# listed below becomes NA.
URL_data$referent <- factor(
  x      = Recode(
    var     = URL_data$referent,
    recodes = '"Replication"="Reproducibility"; "Bibliographic sources"="Bibliographic source"'),
  levels = c("Reproducibility", "Database", "Bibliographic source"))

# Process URLs, e.g., so that trivial features like a trailing slash don't
# make the same URLs seem different in the data.  The spelling that was read
# from the file is kept in URL_originalCase; URL holds the normalized form.
URL_data$URL_originalCase <- URL_data$URL
URL_data$URL <- tolower(URL_data$URL)
URL_data$URL <- sub('/$',         '', URL_data$URL)  # remove trailing slash
# Remove the scheme.  The "s?" makes the pattern match "https://" as well as
# "http://".  Without it, a secure URL keeps its scheme, the "www." pattern
# below can no longer match at the start of the string, and the URL is treated
# as distinct from its "http://" twin.
URL_data$URL <- sub('^https?://', '', URL_data$URL)
URL_data$URL <- sub('^www\\.',    '', URL_data$URL)  # remove leading "www."

# Fix small problems with the coding: inconsistent capitalization, two
# wordings of the same result, and stray colons.

# result2014: capitalize the one lower-case label.
URL_data[["result2014"]] <- Recode(
  URL_data[["result2014"]],
  '"information found"="Information found"')

# result2016: the three steps below must run in this order.  The first merges
# two wordings of the same result.  The second strips colons; gsub() returns a
# character vector rather than a factor.  The third therefore asks Recode()
# for a factor explicitly, and it matches the label as it reads once the colon
# is gone.
URL_data[["result2016"]] <- Recode(
  URL_data[["result2016"]],
  '"Information unavailable but page was found"="Page found but information unavailable"')
URL_data[["result2016"]] <- gsub(':', '', URL_data[["result2016"]], fixed = TRUE)
URL_data[["result2016"]] <- Recode(
  URL_data[["result2016"]],
  '"Information unavailable page not found"="Page not found"',
  as.factor = TRUE)

# URL_type: capitalize the one lower-case label.
URL_data[["URL_type"]] <- Recode(
  URL_data[["URL_type"]],
  '"institutional site"="Institutional site"')

# Tabulate URL_type, then collapse its two "personal site" categories into a
# single "Personal site" category.
table(URL_data[["URL_type"]])
URL_data[["URL_type"]] <- Recode(
  URL_data[["URL_type"]],
  'c("Personal site (university)", "Personal site (non-university)") = "Personal site"')



###############################################################################
# EXAMINE A RANDOM SAMPLE OF 100 BROKEN REPRODUCIBILITY LINKS
###############################################################################
# Read the coded results for the sampled links, then tabulate the outcomes.
# The variable names come from col.names; the two qw() strings are aligned so
# that each column name sits directly above its class.
brokenLinkAnalysis_results <- read.csv(
  "brokenReproducibilityURLs.csv",
  header     = TRUE,
  col.names  = qw("year    issue   page    URL       result"),
  colClasses = qw("integer integer integer character factor"))
table(brokenLinkAnalysis_results[["result"]])
